﻿import string
import requests
from lxml import etree
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, colors
from openpyxl.chart import LineChart, Reference
from datetime import date

# Request headers sent with every page fetch. A desktop Chrome User-Agent
# string is supplied — presumably so usa.gov serves the normal HTML page
# rather than filtering the default python-requests UA (verify if scraping
# starts failing).
heads = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
}

# Agency-index pages, one per letter A-Z. The "A" page is special: it lives
# at the bare /agency-index path (no /a segment); every other letter follows
# the /agency-index/<letter>#<LETTER> pattern, so build those ones.
url_list = ['https://www.usa.gov/agency-index#A'] + [
    'https://www.usa.gov/agency-index/%s#%s' % (c, c.upper())
    for c in string.ascii_lowercase[1:]
]

# Template URL for get_url_list(): the 'a#A' tail is replaced per letter.
# NOTE(review): this module-level name is shadowed by the loop variable in
# the __main__ block below.
url = 'https://www.usa.gov/agency-index/a#A'
letters = string.ascii_lowercase  # all lowercase letters a-z
# NOTE(review): row_i and row_j are never read or written elsewhere in this
# file — likely leftovers; confirm before removing.
row_i = 0
row_j = 0


# Build the list of per-letter index URLs.
def get_url_list(base_url=None, alphabet=None, target=None):
    """Derive one agency-index URL per letter and append each to *target*.

    Fixes a bug in the original: ``url.replace('a#A', letter)`` replaced the
    whole ``a#A`` tail with just the lowercase letter, dropping the
    ``#<LETTER>`` fragment (e.g. producing ``.../agency-index/b`` instead of
    ``.../agency-index/b#B``). That broke the downstream
    ``url.split('#')[-1]`` letter extraction in ``print_name``.

    Args (all optional, defaulting to the module-level globals for
    backward compatibility):
        base_url: template URL containing the ``'a#A'`` placeholder.
        alphabet: iterable of lowercase letters to substitute.
        target: list the generated URLs are appended to.

    Returns:
        The target list, for convenience.
    """
    base_url = url if base_url is None else base_url
    alphabet = letters if alphabet is None else alphabet
    target = url_list if target is None else target
    for letter in alphabet:
        # Keep both the lowercase path segment and the uppercase fragment.
        url_with_hash = base_url.replace('a#A', f'{letter}#{letter.upper()}')
        print(url_with_hash)
        target.append(url_with_hash)
    return target


# Bulk-fill the scraped data into an Excel workbook.
def set_value(first_list, second_list, filename='../爬虫文件/A-Z.xlsx'):
    """Write agency names and URLs into a two-column Excel sheet.

    Args:
        first_list: values for column A (agency names), one per row.
        second_list: values for column B (agency URLs), one per row.
            May differ in length from first_list; each column is filled
            independently from row 1.
        filename: output path. Defaults to the original hard-coded path
            for backward compatibility.
    """
    wb = Workbook()
    sheet = wb.create_sheet("数据", 0)
    # Drop the implicit default "Sheet": the original left an empty extra
    # worksheet in the saved file.
    if 'Sheet' in wb.sheetnames:
        del wb['Sheet']
    for i, name in enumerate(first_list, start=1):
        sheet.cell(row=i, column=1).value = name
    # Renamed the loop variable (was `url`) so it no longer shadows the
    # module-level `url`.
    for j, link in enumerate(second_list, start=1):
        sheet.cell(row=j, column=2).value = link

    wb.save(filename)


def print_name(url_param):
    """Fetch one agency-index page and extract agency names and link URLs.

    Args:
        url_param: page URL whose fragment (text after '#') is the letter
            heading, e.g. 'https://www.usa.gov/agency-index/b#B'.

    Returns:
        (first_list, second_list): the letter followed by the agency names
        scraped from the accordion headings, and the letter followed by the
        agency website hrefs. Both lists are also printed as a side effect.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the request exceeds the 10-second timeout.
    """
    # Fetch the page. The original call had no timeout and could hang
    # forever on a stalled connection; it also silently parsed HTTP error
    # pages, yielding empty results — fail loudly instead.
    index_result = requests.get(url=url_param, headers=heads, timeout=10)
    index_result.raise_for_status()
    html = etree.HTML(index_result.text)

    # First layer: the letter heading, then agency names from the accordion
    # buttons.
    first_list = []
    second_list = []
    print(url_param.split('#')[-1])
    letter_name = url_param.split('#')[-1]  # renamed from `leter_name` (typo)
    first_list.append(letter_name)
    first_list.extend(html.xpath('//div[@class="usa-accordion usagov-directory-accordion-container"]//h2/button/text()'))
    for name in first_list:
        print(name.strip())

    # Second layer: the letter heading, then the agency website hrefs.
    second_list.append(letter_name)
    second_list.extend(html.xpath('//div[@class="usa-accordion usagov-directory-accordion-container"]//div[1]/p[2]/a/@href'))
    print(len(second_list))
    for url_name in second_list:
        print(url_name)

    return first_list, second_list


if __name__ == "__main__":
    # 整理网址
    # get_url_list()
    first_list = []
    second_list = []
    for url in url_list:
        f_list, s_list = print_name(url)
        first_list.extend(f_list)
        second_list.extend(s_list)

    # 保存到 Excel 中
    set_value(first_list, second_list)
